#include "AllShader.hpp"
/*
 * GLSL compute shader source: depthwise 2D convolution.
 *
 * For every invocation whose position lies inside uOutputSize the shader
 * starts from uBias.data[pos.z], then walks a sub-range of the kernel window
 * (sfxy..efxy, derived from uPad, uStride, uDilate, uKernelSize and
 * uInputSize.xy) and accumulates the component-wise product of the uKernel
 * texel at (pos.z, fx, fy) and the uInput texel at (sx1, sy, pos.z).
 * When RELU is defined the result is clamped to >= 0, when RELU6 is defined
 * it is clamped to [0, 6]; the vec4 is then stored to uOutput at pos.
 *
 * FORMAT, XLOCAL, YLOCAL, ZLOCAL, RELU and RELU6 are not defined in this
 * string; presumably the host prepends them (and a #version line) before
 * compiling -- confirm against the shader loader.
 */
const char* glsl_convlutionDepthwise_glsl = 
"layout(std430) buffer;\n"
"layout(FORMAT, binding=0) writeonly uniform mediump image3D uOutput;\n"
"layout(location=1) uniform mediump sampler3D uInput;\n"
"layout(location=2) uniform mediump sampler3D uKernel;\n"
"layout(binding=3) readonly buffer bias{\n"
"    vec4 data[];\n"
"} uBias;\n"
"layout(location=4) uniform ivec2 uPad;\n"
"layout(location=5) uniform ivec2 uKernelSize;\n"
"layout(location=6) uniform ivec2 uStride;\n"
"layout(location=7) uniform ivec2 uDilate;\n"
"// layout(location=8) uniform ivec2 uOffset;\n"
"// layout(location=9) uniform float uReluRate;\n"
"layout(location=10) uniform ivec3 uOutputSize;\n"
"layout(location=11) uniform ivec3 uInputSize;\n"
"#define UP_DIV(x, y) (((x)+(y)-1)/(y))\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID)*ivec3(1, 1, 1);\n"
"    ivec3 outputSize = uOutputSize;\n"
"    if (all(lessThan(pos, outputSize)))\n"
"    {\n"
"        int KSIZE_Y = uKernelSize.y;\n"
"        int KSIZE_X = uKernelSize.x;\n"
"        ivec3 inputSize = uInputSize;\n"
"        ivec2 s0 = pos.xy*uStride-uPad;\n"
"        int fx, fy, fz;\n"
"        ivec2 sfxy = max(ivec2(0), (UP_DIV(-s0, uDilate)));\n"
"        ivec2 efxy = min(uKernelSize, UP_DIV(inputSize.xy-s0, uDilate));\n"
"        vec4 color = uBias.data[pos.z];\n"
"        for (fy=sfxy.y; fy<efxy.y; ++fy)\n"
"        {\n"
"            int sy = fy*uDilate.y + s0.y;\n"
"            for (fx=sfxy.x; fx<efxy.x; ++fx)\n"
"            {\n"
"                int sx1 = fx*uDilate.x + s0.x;\n"
"                vec4 k = texelFetch(uKernel, ivec3(pos.z, fx, fy), 0);\n"
"                color  += k*texelFetch(uInput, ivec3(sx1, sy, pos.z), 0);\n"
"            }\n"
"        }\n"
"#ifdef RELU\n"
"        color = max(color, vec4(0));\n"
"#endif\n"
"#ifdef RELU6\n"
"        color = clamp(color, vec4(0), vec4(6));\n"
"#endif\n"
"        imageStore(uOutput, pos, color);\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: softmax along the x (width) axis.
 *
 * For every invocation with pos.y < h and pos.z < (c + 3) / 4 the shader
 * makes three passes over i in [0, w):
 *   1. component-wise maximum of the uInput texels (seeded with -1000.0),
 *   2. sum of exp(texel - max),
 *   3. store exp(texel - max) / sum to uOutput at (i, pos.y, pos.z).
 * pos.x is not part of the guard, so each invocation processes a whole row.
 * NOTE(review): the dispatch presumably uses a single invocation along x --
 * confirm against the caller.
 *
 * FORMAT, PRECISION, XLOCAL, YLOCAL and ZLOCAL are not defined in this
 * string; presumably supplied by the host before compilation.
 */
const char* glsl_softmaxWidth_glsl = 
"layout(FORMAT, binding=0) writeonly uniform PRECISION image3D uOutput;\n"
"layout(location=1) uniform mediump sampler3D uInput;\n"
"layout(location=2) uniform int w;\n"
"layout(location=3) uniform int h;\n"
"layout(location=4) uniform int c;\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"void main()\n"
"{\n"
"    // input tensor's layout is NC4HW4\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    \n"
"    int channelDiv4 = (c + 3) / 4;\n"
"    int HW = w * h;\n"
"    \n"
"    if(pos.y < h && pos.z < channelDiv4)\n"
"    {\n"
"        // get the max value\n"
"        vec4 maxValue = vec4(-1000.0);\n"
"        for(int i = 0; i < w; ++i)\n"
"        {\n"
"            maxValue = max(maxValue, texelFetch(uInput, ivec3(i, pos.y, pos.z), 0));\n"
"        }\n"
"        \n"
"        // sum\n"
"        vec4 sum = vec4(0.0);\n"
"        for(int i = 0; i < w; ++i)\n"
"        {\n"
"            sum += exp(texelFetch(uInput, ivec3(i, pos.y, pos.z), 0) - maxValue);\n"
"        }\n"
"        // div\n"
"        for(int i = 0; i < w; ++i)\n"
"        {\n"
"            ivec3 curPos = ivec3(i, pos.y, pos.z);\n"
"            imageStore(uOutput, curPos, exp(texelFetch(uInput, ivec3(i, pos.y, pos.z), 0) - maxValue) / sum);\n"
"        }\n"
"        \n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: softmax along the channel axis.
 *
 * For every invocation with pos.x < w and pos.y < h, pos.z selects a group of
 * (c + 3) / 4 consecutive z-slices (batchIndex = pos.z * upDiv4). The shader
 *   1. reduces the c / 4 fully-populated texels to a scalar maximum and then
 *      folds in the first c % 4 lanes of the trailing partial texel,
 *   2. sums exp(value - max) over the same set of lanes,
 *   3. stores exp(texel - max) / sum for all upDiv4 texels of the group.
 *
 * The trailing-texel branches are taken only for c % 4 == 1, 2 or 3. When c
 * is a multiple of 4 there is no partial texel, so nothing beyond the group's
 * own slices is read (a plain `else` here would pull a texel belonging to the
 * next group, or an out-of-range one, into both the max and the sum).
 *
 * FORMAT, PRECISION, XLOCAL, YLOCAL and ZLOCAL are not defined in this
 * string; presumably supplied by the host before compilation.
 */
const char* glsl_softmaxChannel_glsl = 
"layout(FORMAT, binding=0) writeonly uniform PRECISION image3D uOutput;\n"
"layout(location=1) uniform mediump sampler3D uInput;\n"
"layout(location=2) uniform int w;\n"
"layout(location=3) uniform int h;\n"
"layout(location=4) uniform int c;\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"void main()\n"
"{\n"
"    // input tensor's layout is NC4HW4\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    \n"
"    if(pos.x < w && pos.y < h)\n"
"    {\n"
"        int channelDiv4 = c / 4;\n"
"        int upDiv4 = (c + 3) / 4;\n"
"        int lastChannel = c % 4;\n"
"        int batchIndex = pos.z * upDiv4;\n"
"        // get the max value\n"
"        vec4 maxValue = vec4(-1000.0);\n"
"        for(int i = 0; i < channelDiv4; ++i)\n"
"        {\n"
"            maxValue = max(maxValue, texelFetch(uInput, ivec3(pos.x, pos.y, i + batchIndex), 0));\n"
"        }\n"
"        // get the true max value\n"
"        float maxValueTrue = -1000.0;\n"
"        \n"
"        maxValueTrue = max(maxValue.x, maxValue.y);\n"
"        maxValueTrue = max(maxValueTrue, maxValue.z);\n"
"        maxValueTrue = max(maxValueTrue, maxValue.w);\n"
"        \n"
"        // lastChannel == 0 means there is no partial texel to fold in\n"
"        if(lastChannel == 1)\n"
"        {\n"
"            vec4 tempData = texelFetch(uInput, ivec3(pos.x, pos.y, channelDiv4 + batchIndex), 0);\n"
"            maxValueTrue = max(maxValueTrue, tempData.x);\n"
"        }\n"
"        else if(lastChannel == 2)\n"
"        {\n"
"            vec4 tempData = texelFetch(uInput, ivec3(pos.x, pos.y, channelDiv4 + batchIndex), 0);\n"
"            maxValueTrue = max(maxValueTrue, tempData.x);\n"
"            maxValueTrue = max(maxValueTrue, tempData.y);\n"
"        }\n"
"        else if(lastChannel == 3)\n"
"        {\n"
"            vec4 tempData = texelFetch(uInput, ivec3(pos.x, pos.y, channelDiv4 + batchIndex), 0);\n"
"            maxValueTrue = max(maxValueTrue, tempData.x);\n"
"            maxValueTrue = max(maxValueTrue, tempData.y);\n"
"            maxValueTrue = max(maxValueTrue, tempData.z);\n"
"        }\n"
"        \n"
"        // exp\n"
"        maxValue = vec4(maxValueTrue);\n"
"        vec4 sum = vec4(0.0);\n"
"        for(int i = 0; i < channelDiv4; ++i)\n"
"        {\n"
"            sum += exp(texelFetch(uInput, ivec3(pos.x, pos.y, i + batchIndex), 0) - maxValue);\n"
"        }\n"
"        \n"
"        float sumTrue = 0.0;\n"
"        sumTrue = sum.x + sum.y + sum.z + sum.w;\n"
"        \n"
"        if(lastChannel == 1)\n"
"        {\n"
"            vec4 tempData = texelFetch(uInput, ivec3(pos.x, pos.y, channelDiv4 + batchIndex), 0);\n"
"            sumTrue += exp(tempData.x - maxValueTrue);\n"
"        }\n"
"        else if(lastChannel == 2)\n"
"        {\n"
"            vec4 tempData = texelFetch(uInput, ivec3(pos.x, pos.y, channelDiv4 + batchIndex), 0);\n"
"            sumTrue += (exp(tempData.x - maxValueTrue) + exp(tempData.y - maxValueTrue));\n"
"        }\n"
"        else if(lastChannel == 3)\n"
"        {\n"
"            vec4 tempData = texelFetch(uInput, ivec3(pos.x, pos.y, channelDiv4 + batchIndex), 0);\n"
"            sumTrue += (exp(tempData.x - maxValueTrue) + exp(tempData.y - maxValueTrue) + exp(tempData.z - maxValueTrue));\n"
"        }\n"
"        \n"
"        // div sum\n"
"        sum = vec4(sumTrue);\n"
"        for(int i = 0; i < upDiv4; ++i)\n"
"        {\n"
"            ivec3 curPos = ivec3(pos.x, pos.y, i + batchIndex);\n"
"            imageStore(uOutput, curPos, exp(texelFetch(uInput, curPos, 0) - maxValue) / sum);\n"
"        }\n"
"        \n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: element-wise operation skeleton.
 *
 * For every invocation inside uOutputSize the shader evaluates MAINOP(pos)
 * and stores the resulting vec4 to uOutput at pos. The work-group size is
 * fixed at 2 x 2 x 16.
 *
 * MAINOP, FORMAT and PRECISION are not defined in this string, and no input
 * resource is declared here; presumably the host prepends the input
 * declarations together with the MAINOP definition -- confirm against the
 * caller.
 */
const char* glsl_eltwise_glsl = 
"layout(FORMAT, binding=1) writeonly uniform PRECISION image3D uOutput;\n"
"layout(location=10) uniform ivec3 uOutputSize;\n"
"layout (local_size_x = 2, local_size_y = 2, local_size_z = 16) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    ivec3 outputSize = uOutputSize;\n"
"    if (all(lessThan(pos, outputSize)))\n"
"    {\n"
"        vec4 color = MAINOP(pos);\n"
"        imageStore(uOutput, pos, color);\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: blocked matrix multiply on 2D images.
 *
 * For every invocation with pos.x < outputSize.x and pos.y < outputSize.y the
 * shader loops k over [0, ic_4). Each iteration loads four uKernel texels at
 * x = 4k .. 4k+3 on row pos.y into a mat4, loads four uInput texels at
 * x = 4k .. 4k+3 on row pos.x, and accumulates mat4 * vec4 into four vec4
 * accumulators. The accumulators are finally stored to uOutput at
 * (pos.x, 4 * pos.y + 0 .. 3).
 *
 * The layout notes inside the shader text describe the intended packing of
 * the kernel / input / output images. FORMAT, XLOCAL, YLOCAL and ZLOCAL are
 * not defined in this string; presumably supplied by the host.
 */
const char* glsl_gemm16x16_glsl = 
"layout(std430) buffer;\n"
"layout(binding=0, FORMAT) writeonly mediump uniform image2D uOutput;\n"
"layout(binding=1, FORMAT) readonly mediump uniform image2D uInput;\n"
"layout(binding=2, FORMAT) readonly mediump uniform image2D uKernel;\n"
"layout(location=3) uniform ivec2 outputSize;\n"
"layout(location=4) uniform int ic_4;\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"//index : 1, oc/4, (ob*oh*ow)/4\n"
"//outputsize :  oc/4, (ob*oh*ow)/4\n"
"//multiLength : ci/4\n"
"//kernel image : oc/4, ic/4 * ic4  * oc4\n"
"//input : temp image : (ib*oh*ow)/ 4, ic/4*(ib*oh*ow)%4*ic4\n"
"//output : temp image : oc/4 * (ob*oh*ow)%4, (ob*oh*ow)/4 * oc4\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID); // 1, oc/4, (ob*oh*ow)/4\n"
"    int oc_4 = pos.y;\n"
"    int obxohxow_4 = pos.x;\n"
"    if (obxohxow_4 < outputSize.x && oc_4 < outputSize.y)\n"
"    {\n"
"        vec4 o0 = vec4(0);\n"
"        vec4 o1 = vec4(0);\n"
"        vec4 o2 = vec4(0);\n"
"        vec4 o3 = vec4(0);\n"
"        for (int k=0; k<ic_4; ++k)\n"
"        {\n"
"            int k4 = k << 2;\n"
"            vec4 k0 = imageLoad(uKernel, ivec2(k4, oc_4));\n"
"            vec4 s0 = imageLoad(uInput, ivec2(k4++, obxohxow_4));\n"
"            vec4 k1 = imageLoad(uKernel, ivec2(k4, oc_4));\n"
"            vec4 s1 = imageLoad(uInput, ivec2(k4++, obxohxow_4));\n"
"            vec4 k2 = imageLoad(uKernel, ivec2(k4, oc_4));\n"
"            vec4 s2 = imageLoad(uInput, ivec2(k4++, obxohxow_4));\n"
"            vec4 k3 = imageLoad(uKernel, ivec2(k4, oc_4));\n"
"            vec4 s3 = imageLoad(uInput, ivec2(k4, obxohxow_4));\n"
"            mat4 kernel_mat = mat4(k0, k1, k2, k3);\n"
"            o0 += kernel_mat * s0;\n"
"            o1 += kernel_mat * s1;\n"
"            o2 += kernel_mat * s2;\n"
"            o3 += kernel_mat * s3;\n"
"        }\n"
"        int oc_44 = oc_4 << 2;\n"
"        imageStore(uOutput, ivec2(obxohxow_4, oc_44++), o0);\n"
"        imageStore(uOutput, ivec2(obxohxow_4, oc_44++), o1);\n"
"        imageStore(uOutput, ivec2(obxohxow_4, oc_44++), o2);\n"
"        imageStore(uOutput, ivec2(obxohxow_4, oc_44++), o3);\n"
"        \n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: PReLU with one slope vec4 per z-slice.
 *
 * For every invocation inside imgSize.xyz the shader reads the uInput texel
 * at pos, multiplies it by uSlope.data[pos.z], and uses mix() with a bvec4
 * selector to write the scaled component where the input component is
 * negative and the unmodified component otherwise.
 *
 * The local `ivec3 imgSize` inside main() reuses the uniform's name; its
 * initializer reads the ivec4 uniform (this relies on GLSL making a new name
 * visible only after its initializer).
 *
 * FORMAT, PRECISION, XLOCAL, YLOCAL and ZLOCAL are not defined in this
 * string; presumably supplied by the host before compilation.
 */
const char* glsl_preluWithChannel_glsl = 
"layout(FORMAT, binding=0) writeonly uniform PRECISION image3D uOutput;\n"
"layout(location=1) uniform mediump sampler3D uInput;\n"
"layout(binding=2) readonly buffer slope{\n"
"    vec4 data[];\n"
"} uSlope;\n"
"layout(location=3) uniform ivec4 imgSize;\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    ivec3 imgSize = imgSize.xyz;\n"
"    if(pos.x < imgSize.x && pos.y < imgSize.y && pos.z < imgSize.z)\n"
"    {\n"
"        vec4 slope = uSlope.data[pos.z];\n"
"        vec4 dataIn = texelFetch(uInput, pos, 0);\n"
"        vec4 dataTemp = dataIn * slope;\n"
"        bvec4 lessZero = bvec4(lessThan(dataIn, vec4(0.0)));\n"
"        imageStore(uOutput, pos, mix(dataIn, dataTemp, lessZero));\n"
"    }\n"
"    \n"
"}\n"
;
/*
 * GLSL compute shader source: texel-for-texel copy of a 3D texture.
 *
 * For every invocation with pos.x < imgSize.x and pos.y < imgSize.y the
 * shader fetches the uInput texel at pos and stores it to uOutput at pos.
 * pos.z is not range-checked here.
 * NOTE(review): presumably the dispatch covers exactly the valid z range --
 * confirm against the caller.
 *
 * FORMAT, PRECISION, XLOCAL, YLOCAL and ZLOCAL are not defined in this
 * string; presumably supplied by the host before compilation.
 */
const char* glsl_image_copy_glsl = 
"layout(FORMAT, binding=0) writeonly uniform PRECISION image3D uOutput;\n"
"layout(location=1) uniform mediump sampler3D uInput;\n"
"layout(location=2) uniform ivec4 imgSize;\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    ivec3 imgSize = imgSize.xyz;\n"
"    if(pos.x < imgSize.x && pos.y < imgSize.y)\n"
"    {\n"
"        vec4 dataIn =  texelFetch(uInput, pos, 0);\n"
"        imageStore(uOutput, pos, dataIn);\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: copy a vec4 storage buffer into a 2D image.
 *
 * For every invocation with pos.x < width and pos.y < height the shader reads
 * uKernel.data[pos.x + pos.y * width] and stores it to uOutput at
 * (pos.x, pos.y). The work-group size is fixed at 4 x 4 x 1.
 *
 * The layout notes inside the shader text describe the intended packing of
 * the kernel buffer and image. FORMAT and PRECISION are not defined in this
 * string; presumably supplied by the host before compilation.
 */
const char* glsl_kernel2image_glsl = 
"layout(std430) buffer;\n"
"layout(FORMAT, binding=0) writeonly uniform PRECISION image2D uOutput;\n"
"layout(binding=2) readonly buffer kernel{\n"
"    vec4 data[];\n"
"} uKernel;\n"
"layout(location = 3) uniform int width;\n"
"layout(location = 4) uniform int height;\n"
"//index : ky * kx, oc/4, ic/4\n"
"//kernel buffer : oc ic h w -> oc/4 ic/4 ky kx ic4 oc4\n"
"//kernel image : oc/4, ky * kx * ic/4 * ic4\n"
"layout (local_size_x = 4, local_size_y = 4, local_size_z = 1) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    if (pos.x < width && pos.y < height)\n"
"    {\n"
"        vec4 res = uKernel.data[pos.x+pos.y*width];\n"
"        imageStore(uOutput, ivec2(pos.x, pos.y), res);\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: 1x1 convolution producing four outputs along x
 * per invocation.
 *
 * The raw invocation id is range-checked against uOutputSize; pos.x is then
 * multiplied by uUnroll. Starting from uBias.data[pos.z], the shader loops fz
 * over [0, uInputSize.z): it builds a mat4 from the four uKernel texels at
 * (4*fz + 0..3, pos.z, 0) and accumulates mat4 * input-texel for the input
 * texels at x = pos.x + 0..3. When RELU is defined results are clamped to
 * >= 0, when RELU6 is defined to [0, 6]. Four vec4 values are then stored to
 * uOutput at x = pos.x + 0..3.
 * NOTE(review): four texels are always read and written regardless of the
 * value of uUnroll, so the host presumably sets uUnroll to 4 and sizes
 * uOutputSize.x accordingly -- confirm against the caller.
 *
 * FORMAT, PRECISION, XLOCAL, YLOCAL, ZLOCAL, RELU and RELU6 are not defined
 * in this string; presumably supplied by the host before compilation.
 */
const char* glsl_convolution1x1_glsl = 
"layout(std430) buffer;\n"
"layout(FORMAT, binding=0) writeonly uniform PRECISION image3D uOutput;\n"
"layout(location=1) uniform mediump sampler3D uInput;\n"
"layout(location=2) uniform mediump sampler3D uKernel;\n"
"layout(binding=3) readonly buffer bias{\n"
"    vec4 data[];\n"
"} uBias;\n"
"layout(location=8) uniform int uUnroll;\n"
"layout(location=10) uniform ivec3 uOutputSize;\n"
"layout(location=11) uniform ivec3 uInputSize;\n"
"#define UP_DIV(x, y) (((x)+(y)-1)/(y))\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"void main()\n"
"{\n"
"    ivec3 outputSize = uOutputSize;\n"
"    if (all(lessThan(ivec3(gl_GlobalInvocationID), outputSize)))\n"
"    {\n"
"        ivec3 pos = ivec3(gl_GlobalInvocationID)*ivec3(uUnroll, 1, 1);\n"
"        ivec3 inputSize = uInputSize;\n"
"        int sy = pos.y;\n"
"        int sx = pos.x;\n"
"        int fx, fy, fz;\n"
"        vec4 color = uBias.data[pos.z];\n"
"        vec4 color2 = color;\n"
"        vec4 color3 = color;\n"
"        vec4 color4 = color;\n"
"        int kernelY = pos.z;\n"
"        for (fz=0; fz<inputSize.z; ++fz)\n"
"        {\n"
"            int kernelX = 4*fz;\n"
"            vec4 k0 = texelFetch(uKernel, ivec3(kernelX+0, kernelY, 0), 0);\n"
"            vec4 k1 = texelFetch(uKernel, ivec3(kernelX+1, kernelY, 0), 0);\n"
"            vec4 k2 = texelFetch(uKernel, ivec3(kernelX+2, kernelY, 0), 0);\n"
"            vec4 k3 = texelFetch(uKernel, ivec3(kernelX+3, kernelY, 0), 0);\n"
"            \n"
"            mat4 k = mat4(k0, k1, k2, k3);\n"
"            \n"
"            color  += k*texelFetch(uInput, ivec3(sx+0, sy, fz), 0);\n"
"            color2 += k*texelFetch(uInput, ivec3(sx+1, sy, fz), 0);\n"
"            color3 += k*texelFetch(uInput, ivec3(sx+2, sy, fz), 0);\n"
"            color4 += k*texelFetch(uInput, ivec3(sx+3, sy, fz), 0);\n"
"        }\n"
"        #ifdef RELU\n"
"        color = max(color, vec4(0));\n"
"        color2 = max(color2, vec4(0));\n"
"        color3 = max(color3, vec4(0));\n"
"        color4 = max(color4, vec4(0));\n"
"        #endif\n"
"        #ifdef RELU6\n"
"        color = clamp(color, vec4(0), vec4(6));\n"
"        color2 = clamp(color2, vec4(0), vec4(6));\n"
"        color3 = clamp(color3, vec4(0), vec4(6));\n"
"        color4 = clamp(color4, vec4(0), vec4(6));\n"
"        #endif\n"
"        imageStore(uOutput, ivec3(pos.x+0, pos.y, pos.z), color);\n"
"        imageStore(uOutput, ivec3(pos.x+1, pos.y, pos.z), color2);\n"
"        imageStore(uOutput, ivec3(pos.x+2, pos.y, pos.z), color3);\n"
"        imageStore(uOutput, ivec3(pos.x+3, pos.y, pos.z), color4);\n"
"    }\n"
"    \n"
"}\n"
;
/*
 * GLSL compute shader source: scatter a 2D matrix-multiply result back into a
 * 3D output image, adding bias and an optional activation.
 *
 * pos.z is split into a batch index (ob = pos.z / outputSize.z) and a channel
 * block index (oc_4 = pos.z % outputSize.z). For every invocation with
 * pos.xy inside outputSize.xy the shader computes the linear position
 * ob * W * H + y * W + x, fetches the uInput texel at
 * (linear / 4, oc_4 * 4 + linear % 4), adds the bias, applies RELU (clamp to
 * >= 0) or RELU6 (clamp to [0, 6]) when those macros are defined, and stores
 * the vec4 to uOutput at pos.
 *
 * The bias is looked up with oc_4 rather than pos.z: pos.z also carries the
 * batch offset, so using it directly would select the wrong (or an
 * out-of-range) bias entry for every batch after the first. For batch 0 the
 * two indices are identical.
 *
 * FORMAT, XLOCAL, YLOCAL, ZLOCAL, RELU and RELU6 are not defined in this
 * string; presumably supplied by the host before compilation.
 */
const char* glsl_col2im_glsl = 
"layout(std430) buffer;\n"
"layout(binding=0, FORMAT) writeonly uniform mediump image3D uOutput;\n"
"layout(location=1) uniform mediump sampler2D uInput;\n"
"layout(binding=2) readonly buffer bias{\n"
"    vec4 data[];\n"
"} uBias;\n"
"layout(location=3) uniform ivec3 outputSize;\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"#define UP_DIV(x, y) (((x)+(y)-1)/(y))\n"
"//index : ob*oc/4, oh, ow\n"
"//outputsize : oc/4, oh, ow\n"
"//input temp image : oc/4 * (ob*oh*ow)%4, (ob*oh*ow)/4 * oc4\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    int ob = pos.z / outputSize.z;\n"
"    int oc_4 = pos.z % outputSize.z;\n"
"    if (all(lessThan(pos.xy, outputSize.xy)))\n"
"    {\n"
"        int sourceXIndex = ob*outputSize.x*outputSize.y + pos.y*outputSize.x + pos.x;\n"
"        int sourceX = sourceXIndex / 4;\n"
"        int sourceY = oc_4 * 4 + sourceXIndex % 4;\n"
"        // bias is per channel block, independent of the batch index\n"
"        vec4 color = uBias.data[oc_4];\n"
"        color += texelFetch(uInput, ivec2(sourceX, sourceY), 0);\n"
"#ifdef RELU\n"
"        color = max(color, vec4(0));\n"
"#endif\n"
"#ifdef RELU6\n"
"        color = clamp(color, vec4(0), vec4(6));\n"
"#endif\n"
"        imageStore(uOutput, pos, color);\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: 2D average pooling.
 *
 * For every invocation inside uOutputSize the window origin is
 * pos.xy * uStride - uPad. The window is clipped to uInputSize.xy
 * (sfxy = max(0, -origin), efxy = min(uKernel, inputSize - origin)); the
 * shader sums the uInput texels inside the clipped window, counts them, and
 * stores sum / count to uOutput at pos. Only texels that are actually inside
 * the input contribute to the count.
 *
 * If the clipped window is empty the count stays at zero; the divisor is
 * therefore floored at 1.0 so the shader writes 0 instead of the result of
 * 0 / 0. The work-group size is fixed at 2 x 2 x 16.
 *
 * FORMAT and PRECISION are not defined in this string; presumably supplied
 * by the host before compilation.
 */
const char* glsl_avgpool_glsl = 
"layout(FORMAT, binding=0, location=0) readonly uniform PRECISION image3D uInput;\n"
"layout(FORMAT, binding=1, location=1) writeonly uniform PRECISION image3D uOutput;\n"
"layout(location = 2) uniform ivec2 uKernel;\n"
"layout(location = 3) uniform ivec2 uStride;\n"
"layout(location = 4) uniform ivec2 uPad;\n"
"layout(location=10) uniform ivec3 uOutputSize;\n"
"layout(location=11) uniform ivec3 uInputSize;\n"
"layout (local_size_x = 2, local_size_y = 2, local_size_z = 16) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    ivec3 outputSize = uOutputSize;\n"
"    ivec2 spos = pos.xy*uStride-uPad;\n"
"    if (all(lessThan(pos, outputSize)))\n"
"    {\n"
"        ivec2 inputSizeXY = uInputSize.xy;\n"
"        vec4 color = vec4(0.0);\n"
"        vec4 num = vec4(0.0);\n"
"        ivec2 sfxy = max(ivec2(0), -spos);\n"
"        ivec2 efxy = min(uKernel, inputSizeXY-spos);\n"
"        for (int fy=sfxy.y; fy<efxy.y; ++fy)\n"
"        {\n"
"            for (int fx=sfxy.x; fx<efxy.x; ++fx)\n"
"            {\n"
"                color += imageLoad(uInput, ivec3(spos.x+fx, spos.y+fy, pos.z));\n"
"                num += vec4(1.0);\n"
"            }\n"
"        }\n"
"        // an empty clipped window leaves num at zero; never divide by it\n"
"        imageStore(uOutput, pos, color/max(num, vec4(1.0)));\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: 2D max pooling.
 *
 * For every invocation inside uOutputSize the window origin is
 * pos.xy * uStride - uPad. The window is clipped to uInputSize.xy
 * (sfxy = max(0, -origin), efxy = min(uKernel, inputSize.xy - origin)); the
 * shader takes the component-wise maximum of the uInput texels inside the
 * clipped window, seeded with -100000.0, and stores it to uOutput at pos.
 * The work-group size is fixed at 2 x 2 x 16.
 *
 * FORMAT and PRECISION are not defined in this string; presumably supplied
 * by the host before compilation.
 */
const char* glsl_maxpool_glsl = 
"layout(FORMAT, binding=0) readonly uniform PRECISION image3D uInput;\n"
"layout(FORMAT, binding=1) writeonly uniform PRECISION image3D uOutput;\n"
"layout(location = 2) uniform ivec2 uKernel;\n"
"layout(location = 3) uniform ivec2 uStride;\n"
"layout(location = 4) uniform ivec2 uPad;\n"
"layout(location=10) uniform ivec3 uOutputSize;\n"
"layout(location=11) uniform ivec3 uInputSize;\n"
"layout (local_size_x = 2, local_size_y = 2, local_size_z = 16) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    ivec3 outputSize = uOutputSize;\n"
"    ivec2 spos = pos.xy*uStride-uPad;\n"
"    if (all(lessThan(pos, outputSize)))\n"
"    {\n"
"        ivec3 inputSize = uInputSize;\n"
"        ivec2 sfxy = max(ivec2(0), -spos);\n"
"        ivec2 efxy = min(uKernel, inputSize.xy-spos);\n"
"        vec4 color = vec4(-100000.0);\n"
"        for (int fy=sfxy.y; fy<efxy.y; ++fy)\n"
"        {\n"
"            for (int fx=sfxy.x; fx<efxy.x; ++fx)\n"
"            {\n"
"                color = max(color, imageLoad(uInput, ivec3(spos.x+fx, spos.y+fy, pos.z)));\n"
"            }\n"
"        }\n"
"        imageStore(uOutput, pos, color);\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: rearrange a 3D input texture into a 2D image
 * for a 1x1 convolution.
 *
 * For every invocation with pos.x < outputWidth and pos.y < outputHeight,
 * pos.z is split into ic_4_i = pos.z % ic_4 and ib_i = pos.z / ic_4. The
 * linear position ib_i * W * H + y * W + x is split into a row (linear / 4)
 * and a lane offset (linear % 4); the uInput texel at pos is stored to
 * uOutput at (ic_4_i * 4 + lane offset, row).
 *
 * The layout notes inside the shader text describe the intended packing.
 * FORMAT, XLOCAL, YLOCAL and ZLOCAL are not defined in this string;
 * presumably supplied by the host before compilation.
 */
const char* glsl_im2col1x1_glsl = 
"layout(std430) buffer;\n"
"layout(binding=0, FORMAT) writeonly mediump uniform image2D uOutput;\n"
"layout(location=1) uniform mediump sampler3D uInput;\n"
"layout(location=5) uniform int ic_4;\n"
"layout(location=6) uniform int outputWidth;\n"
"layout(location=7) uniform int outputHeight;\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"#define UP_DIV(x, y) (((x)+(y)-1)/(y))\n"
"//index : ib*ic/4, oh, ow\n"
"//input image ic/4, ih, iw * ic4\n"
"//output : temp image : (ib*oh*ow)/ 4, ic/4*(ib*oh*ow)%4*ic4\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    if (pos.x < outputWidth && pos.y < outputHeight)\n"
"    {\n"
"        int ic_4_i = pos.z % ic_4;\n"
"        int ib_i = pos.z / ic_4;\n"
"        int destYOrigin = ib_i*outputWidth*outputHeight + pos.y*outputWidth + pos.x;\n"
"        int destY = destYOrigin / 4;\n"
"        int destXOffset = destYOrigin % 4;\n"
"        vec4 color = texelFetch(uInput, ivec3(pos.x, pos.y, pos.z), 0);\n"
"        imageStore(uOutput, ivec2(ic_4_i*4+destXOffset, destY), color);\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: bilinear resize of a 3D texture in x and y.
 *
 * For every invocation inside outImgSize.xyz the source coordinate is
 * pos.xy * scale. Its floor gives the first sample column/row and floor + 1
 * the second, both clamped to [0, inImgSize - 1]; the fractional part (taken
 * against the unclamped floor) is the interpolation weight. The four
 * neighbouring uInput texels on slice pos.z are blended first along x, then
 * along y (weight factorY on row y1 + 1), and the result is stored to
 * uOutput at pos.
 *
 * FORMAT, PRECISION, XLOCAL, YLOCAL and ZLOCAL are not defined in this
 * string; presumably supplied by the host before compilation.
 */
const char* glsl_resizeBilinear_glsl = 
"layout(FORMAT, binding=0) writeonly uniform PRECISION image3D uOutput;\n"
"layout(location=1) uniform mediump sampler3D uInput;\n"
"layout(location=2) uniform ivec4 inImgSize;\n"
"layout(location=3) uniform ivec4 outImgSize;\n"
"layout(location=4) uniform vec2 scale;\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    ivec3 inputImgSize = inImgSize.xyz;\n"
"    ivec3 outputImgSize = outImgSize.xyz;\n"
"    \n"
"    if(pos.x < outputImgSize.x && pos.y < outputImgSize.y && pos.z < outputImgSize.z)\n"
"    {\n"
"        float srcX = float(pos.x) * scale.x;\n"
"        int x1 = int(floor(srcX));\n"
"        int x11 = clamp(x1, 0, inputImgSize.x - 1);\n"
"        int x12 = clamp(x1 + 1, 0, inputImgSize.x - 1);\n"
"        vec4 factorX = vec4(srcX - float(x1));\n"
"        float srcY = float(pos.y) * scale.y;\n"
"        int y1 = int(floor(srcY));\n"
"        int y11 = clamp(y1, 0, inputImgSize.y - 1);\n"
"        int y12 = clamp(y1 + 1, 0, inputImgSize.y - 1);\n"
"        vec4 factorY = vec4(srcY - float(y1));\n"
"        vec4 res1 = texelFetch(uInput, ivec3(x11, y12, pos.z), 0);\n"
"        vec4 res2 = texelFetch(uInput, ivec3(x12, y12, pos.z), 0);\n"
"        vec4 res3 = texelFetch(uInput, ivec3(x11, y11, pos.z), 0);\n"
"        vec4 res4 = texelFetch(uInput, ivec3(x12, y11, pos.z), 0);\n"
"        vec4 res11 = (vec4(1.0) - factorX) * res1 + factorX * res2;\n"
"        vec4 res12 = (vec4(1.0) - factorX) * res3 + factorX * res4;\n"
"        \n"
"        imageStore(uOutput, pos, factorY * res11 + (vec4(1.0) - factorY) * res12);\n"
"    }\n"
"    \n"
"}\n"
;
/*
 * GLSL compute shader source: element-wise unary operation.
 *
 * For every invocation inside imgSize.xyz the shader fetches the uInput0
 * texel at pos and stores `sum` to uOutput at pos. `sum` is declared only
 * inside the `#ifdef EXP` section (as exp(data)), so the shader text is only
 * complete when EXP is defined; no other operation is provided in this
 * string.
 *
 * EXP, FORMAT, PRECISION, XLOCAL, YLOCAL and ZLOCAL are not defined in this
 * string; presumably supplied by the host before compilation.
 */
const char* glsl_unary_glsl = 
"layout(FORMAT, binding=0) writeonly uniform PRECISION image3D uOutput;\n"
"layout(location=1) uniform mediump sampler3D uInput0;\n"
"layout(location=3) uniform ivec4 imgSize;\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    ivec3 inSize = imgSize.xyz;\n"
"    if(all(lessThan(pos, inSize)))\n"
"    {\n"
"        vec4 data = texelFetch(uInput0, pos, 0);\n"
"#ifdef EXP\n"
"        vec4 sum = exp(data);\n"
"#endif\n"
"        imageStore(uOutput, pos, sum);\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: nearest-neighbour style resize of a 3D texture
 * in x and y.
 *
 * For every invocation with pos.x < outImgSize.x and pos.y < outImgSize.y
 * the source coordinate is floor(pos.xy * scale), clamped to
 * [0, inImgSize - 1]; the uInput texel at that coordinate on slice pos.z is
 * stored to uOutput at pos. pos.z is not range-checked here.
 * NOTE(review): presumably the dispatch covers exactly the valid z range --
 * confirm against the caller.
 *
 * FORMAT, PRECISION, XLOCAL, YLOCAL and ZLOCAL are not defined in this
 * string; presumably supplied by the host before compilation.
 */
const char* glsl_resizeNearest_glsl = 
"layout(FORMAT, binding=0) writeonly uniform PRECISION image3D uOutput;\n"
"layout(location=1) uniform mediump sampler3D uInput;\n"
"layout(location=2) uniform ivec4 inImgSize;\n"
"layout(location=3) uniform ivec4 outImgSize;\n"
"layout(location=4) uniform vec2 scale;\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    // input output layout is NC4HW4\n"
"    \n"
"    ivec3 inputImgSize = inImgSize.xyz;\n"
"    ivec3 outputImgSize = outImgSize.xyz;\n"
"    \n"
"    if(pos.x < outputImgSize.x && pos.y < outputImgSize.y)\n"
"    {\n"
"        float srcX = float(pos.x) * scale.x;\n"
"        int x1 = int(floor(srcX));\n"
"        int x11 = clamp(x1, 0, inputImgSize.x - 1);\n"
"        \n"
"        float srcY = float(pos.y) * scale.y;\n"
"        int y1 = int(floor(srcY));\n"
"        int y11 = clamp(y1, 0, inputImgSize.y - 1);\n"
"        \n"
"        vec4 outValue = texelFetch(uInput, ivec3(x11, y11, pos.z), 0);\n"
"        \n"
"        imageStore(uOutput, pos, outValue);\n"
"    }\n"
"    \n"
"}\n"
;
/*
 * GLSL compute shader source: copy a 3D sampler texture into a 3D image.
 *
 * For every invocation with pos.x < width, pos.y < height and
 * pos.z < channel the shader fetches the uInput texel at pos and stores it
 * unchanged to uOutput at pos.
 * NOTE(review): `channel` bounds pos.z, i.e. it counts z-slices of the
 * texture; whether that equals the tensor channel count or a quarter of it
 * is decided by the caller -- confirm.
 *
 * FORMAT, PRECISION, XLOCAL, YLOCAL and ZLOCAL are not defined in this
 * string; presumably supplied by the host before compilation.
 */
const char* glsl_converter_glsl = 
"layout(std430) buffer;\n"
"layout(FORMAT, binding=0) writeonly uniform PRECISION image3D uOutput;\n"
"layout(location=1) uniform mediump sampler3D uInput;\n"
"layout(location = 2) uniform int width;\n"
"layout(location = 3) uniform int height;\n"
"layout(location = 4) uniform int channel;\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    if (pos.x < width && pos.y < height && pos.z < channel)\n"
"    {\n"
"        vec4 result = texelFetch(uInput, pos, 0);\n"
"        imageStore(uOutput, pos, result);\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: region-of-interest max pooling.
 *
 * For every invocation with pos.xy inside uOutputSize.xy, pos.z is split into
 * a RoI index (pos.z / uInputSize.z) and an input slice
 * (pos.z % uInputSize.z). The RoI corners are read from two uRoI texels
 * (lanes y, z, w of the first and lane x of the second), scaled by
 * spatialScale and rounded. The RoI is divided into uOutputSize.x by
 * uOutputSize.y bins; the bin for this invocation is clamped to the input
 * extent and the component-wise maximum of the uInput texels inside it is
 * stored to uOutput at pos. An empty bin yields vec4(0.0).
 *
 * The running maximum is seeded with the bin's own first texel
 * (wStart, hStart). Seeding it with the texel at (0, 0) would let a value
 * from outside the bin win whenever it exceeds everything inside the bin.
 * For a non-empty bin wStart < wEnd <= uInputSize.x (and likewise for h), so
 * the seed coordinate is always inside the texture.
 *
 * FORMAT, PRECISION, XLOCAL, YLOCAL and ZLOCAL are not defined in this
 * string; presumably supplied by the host before compilation.
 */
const char* glsl_roiPooling_glsl = 
"layout(location=0) uniform mediump sampler3D uInput;\n"
"layout(FORMAT, binding=1) writeonly uniform PRECISION image3D uOutput;\n"
"layout(location=2) uniform mediump sampler3D uRoI;\n"
"layout(location=10) uniform ivec3 uOutputSize;\n"
"layout(location=11) uniform ivec3 uInputSize;\n"
"layout(location=12) uniform float spatialScale;\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    if(pos.x < uOutputSize.x && pos.y < uOutputSize.y)\n"
"    {\n"
"        ivec3 uInputSize = uInputSize.xyz;\n"
"        int roiBatchIndex = pos.z / uInputSize.z;\n"
"        int inputZIndex = pos.z % uInputSize.z;\n"
"        // 0, xmin, ymin, xmax, ymax\n"
"        vec4 roiData0 = texelFetch(uRoI, ivec3(0, 0, roiBatchIndex), 0);\n"
"        vec4 roiData1 = texelFetch(uRoI, ivec3(0, 0, roiBatchIndex + 1), 0);\n"
"        int x1 = int(round(float(roiData0.y) * spatialScale));\n"
"        int y1 = int(round(float(roiData0.z) * spatialScale));\n"
"        int x2 = int(round(float(roiData0.w) * spatialScale));\n"
"        int y2 = int(round(float(roiData1.x) * spatialScale));\n"
"        int roiW = max(x2 - x1 + 1, 1);\n"
"        int roiH = max(y2 - y1 + 1, 1);\n"
"        float binSizeW = float(roiW) / float(uOutputSize.x);\n"
"        float binSizeH = float(roiH) / float(uOutputSize.y);\n"
"        int wStart = clamp(x1 + int(floor(float(pos.x) * binSizeW)), 0, uInputSize.x);\n"
"        int wEnd = clamp(x1 + int(ceil(float(pos.x + 1) * binSizeW)), 0, uInputSize.x);\n"
"        int hStart = clamp(y1 + int(floor(float(pos.y) * binSizeH)), 0, uInputSize.y);\n"
"        int hEnd = clamp(y1 + int(ceil(float(pos.y + 1) * binSizeH)), 0, uInputSize.y);\n"
"        bool isEmpty = (hEnd <= hStart) || (wEnd <= wStart);\n"
"        // seed the max with a texel that belongs to the bin itself\n"
"        vec4 res = isEmpty ? vec4(0.0) : texelFetch(uInput, ivec3(wStart, hStart, inputZIndex), 0);\n"
"        for(int i = hStart; i < hEnd; ++i)\n"
"        {\n"
"            for(int j = wStart; j < wEnd; ++j)\n"
"            {\n"
"                res = max(res, texelFetch(uInput, ivec3(j, i, inputZIndex), 0));\n"
"            }\n"
"        }\n"
"        imageStore(uOutput, pos, res);\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: copy a 3D sub-region between two images.
 *
 * For every invocation inside uBlitSize the shader loads the uInput texel at
 * uSourceOffset + pos and stores it to uOutput at uDestOffset + pos. The
 * work-group size is fixed at 4 x 4 x 4.
 *
 * FORMAT and PRECISION are not defined in this string; presumably supplied
 * by the host before compilation.
 */
const char* glsl_blit_glsl = 
"layout(FORMAT, binding=0) writeonly uniform PRECISION image3D uOutput;\n"
"layout(FORMAT, binding=1) readonly uniform PRECISION image3D uInput;\n"
"layout(location = 2) uniform ivec3 uSourceOffset;\n"
"layout(location = 3) uniform ivec3 uDestOffset;\n"
"layout(location = 4) uniform ivec3 uBlitSize;\n"
"layout (local_size_x = 4, local_size_y = 4, local_size_z = 4) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    if (all(lessThan(pos, uBlitSize)))\n"
"    {\n"
"        ivec3 dstP = uDestOffset + pos;\n"
"        ivec3 srcP = uSourceOffset + pos;\n"
"        imageStore(uOutput, dstP, imageLoad(uInput, srcP));\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: pack a float kernel buffer into a 3D image of
 * vec4 texels for the depthwise convolution.
 *
 * With fx = pos.y and fy = pos.z, the shader gathers four floats from
 * uKernel.data at index z * uFx * uFy + fy * uFx + fx for
 * z = 4 * pos.x + 0 .. 3 and stores them as one vec4 to uOutput at pos.
 * The work-group size is 1 x 1 x 1 and there is no range check on pos or on
 * the buffer indices.
 * NOTE(review): presumably the dispatch size matches the image exactly and
 * the buffer is padded to a multiple of four planes -- confirm against the
 * caller.
 *
 * FORMAT and PRECISION are not defined in this string; presumably supplied
 * by the host before compilation.
 */
const char* glsl_kernel2ImageDepthwise_glsl = 
"layout(std430) buffer;\n"
"layout(FORMAT, binding=0) writeonly uniform PRECISION image3D uOutput;\n"
"layout(binding=2) readonly buffer kernel{\n"
"    float data[];\n"
"} uKernel;\n"
"layout(location = 3) uniform int uFx;\n"
"layout(location = 4) uniform int uFy;\n"
"layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    int fx = pos.y;\n"
"    int fy = pos.z;\n"
"    int z0 = pos.x * 4 + 0;\n"
"    int z1 = pos.x * 4 + 1;\n"
"    int z2 = pos.x * 4 + 2;\n"
"    int z3 = pos.x * 4 + 3;\n"
"    int p0 = z0*uFx*uFy + fy*uFx + fx;\n"
"    int p1 = z1*uFx*uFy + fy*uFx + fx;\n"
"    int p2 = z2*uFx*uFy + fy*uFx + fx;\n"
"    int p3 = z3*uFx*uFy + fy*uFx + fx;\n"
"    vec4 color = vec4(\n"
"        uKernel.data[p0],\n"
"        uKernel.data[p1],\n"
"        uKernel.data[p2],\n"
"        uKernel.data[p3]    \n"
"    );\n"
"    imageStore(uOutput, pos, color);\n"
"}\n"
;
/*
 * GLSL compute shader source: zero-fill a 2D image.
 *
 * For every invocation with pos.x < width and pos.y < height the shader
 * stores vec4(0,0,0,0) to uOutput at (pos.x, pos.y). The work-group size is
 * fixed at 4 x 4 x 1.
 *
 * FORMAT and PRECISION are not defined in this string; presumably supplied
 * by the host before compilation.
 */
const char* glsl_clear_texture_glsl = 
"layout(FORMAT, binding=0) writeonly uniform PRECISION image2D uOutput;\n"
"layout(location = 1) uniform int width;\n"
"layout(location = 2) uniform int height;\n"
"layout (local_size_x = 4, local_size_y = 4, local_size_z = 1) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    if (pos.x < width && pos.y < height)\n"
"    {\n"
"        imageStore(uOutput, ivec2(pos.x, pos.y), vec4(0,0,0,0));\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: axis permutation between two float buffers.
 *
 * For every invocation with pos.x < outImSize.x and pos.y < outImSize.y the
 * shader writes pos.z, pos.y and pos.x into a 4-entry index array at the
 * slots named by dims.y, dims.z and dims.w. It then forms the input offset
 * from entries 1..3 of that array using inImSize.x / inImSize.y as strides,
 * forms the output offset from pos using outImSize.x / outImSize.y as
 * strides, and copies one float. pos.z is not range-checked and dims.x is
 * not used.
 * NOTE(review): entries 1..3 are only all assigned when dims.yzw is a
 * permutation of (1, 2, 3); presumably the caller guarantees this and never
 * permutes the leading axis -- confirm.
 *
 * XLOCAL, YLOCAL and ZLOCAL are not defined in this string; presumably
 * supplied by the host before compilation.
 */
const char* glsl_permute_glsl = 
"layout(binding = 0) readonly buffer srcBuffer{\n"
"    float data[];\n"
"}uInput;\n"
"layout(binding = 1) writeonly buffer dstBuffer{\n"
"    float data[];\n"
"}uOutput;\n"
"layout(location=2) uniform ivec4 dims;\n"
"layout(location=3) uniform ivec4 inImSize;\n"
"layout(location=4) uniform ivec4 outImSize;\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    ivec3 inImgSize = ivec3(inImSize.xyz);\n"
"    ivec3 outImgSize = ivec3(outImSize.xyz);\n"
"    // input, output all are NCHW layout\n"
"    ivec4 dimParam = dims.xyzw;\n"
"    if(pos.x < outImgSize.x && pos.y < outImgSize.y)\n"
"    {\n"
"        int dimIndex[4];\n"
"        \n"
"        dimIndex[dimParam.y] = pos.z;\n"
"        dimIndex[dimParam.z] = pos.y;\n"
"        dimIndex[dimParam.w] = pos.x;\n"
"        int inputIndex = dimIndex[1] * inImgSize.x * inImgSize.y + dimIndex[2] * inImgSize.x + dimIndex[3];\n"
"        int outputIndex = pos.x + pos.y * outImgSize.x + pos.z * outImgSize.x * outImgSize.y;\n"
"        uOutput.data[outputIndex] = uInput.data[inputIndex];\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: unpack a 3D image of vec4 texels into a planar
 * float buffer.
 *
 * For every invocation with pos.x < uWidth and pos.y < uHeight the shader
 * loads the uImage texel at pos and writes its r, g, b and a components to
 * uOutBuffer.data at plane 4 * pos.z + 0 .. 3, where each plane holds
 * uWidth * uHeight floats and the in-plane offset is uWidth * pos.y + pos.x.
 * All four components are always written and pos.z is not range-checked.
 * NOTE(review): presumably the destination buffer is sized for a multiple of
 * four planes -- confirm against the caller.
 * The work-group size is fixed at 8 x 8 x 1.
 *
 * FORMAT and PRECISION are not defined in this string; presumably supplied
 * by the host before compilation.
 */
const char* glsl_image_to_nchw_buffer_glsl = 
"layout(FORMAT, binding=0) readonly uniform PRECISION image3D uImage;\n"
"layout(binding=1) writeonly buffer destBuffer{\n"
"    float data[];\n"
"} uOutBuffer;\n"
"layout(location = 2) uniform int uWidth;\n"
"layout(location = 3) uniform int uHeight;\n"
"layout (local_size_x = 8, local_size_y = 8, local_size_z = 1) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    if (pos.x < uWidth && pos.y < uHeight)\n"
"    {\n"
"        vec4 color = imageLoad(uImage, pos);\n"
"        int z = pos.z*4;\n"
"        uOutBuffer.data[uWidth*pos.y+pos.x+(z+0)*uWidth*uHeight] = color.r;\n"
"        uOutBuffer.data[uWidth*pos.y+pos.x+(z+1)*uWidth*uHeight] = color.g;\n"
"        uOutBuffer.data[uWidth*pos.y+pos.x+(z+2)*uWidth*uHeight] = color.b;\n"
"        uOutBuffer.data[uWidth*pos.y+pos.x+(z+3)*uWidth*uHeight] = color.a;\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: 2D convolution over 3D-texture tensors.
 *
 * Per invocation (guarded by gl_GlobalInvocationID < uOutputSize):
 *   - pos.x is the invocation's x scaled by uUnroll. Four accumulators
 *     (color, color2, color3, color4) cover output texels pos.x+0 .. pos.x+3
 *     on row pos.y, slice pos.z; all four start from uBias.data[pos.z].
 *   - The kernel-row range [sfxy.y, efxy.y) is clipped against 0 and
 *     inputSize.y. Kernel columns always run 0 .. uKernelSize.x-1; the
 *     factors m1..m4 (1.0 or 0.0) multiply away the contributions whose
 *     source column lies outside [0, inputSize.x). The texelFetch itself is
 *     still issued for those columns.
 *   - For every input slice fz, four uKernel texels at x = 4*fz+0..3,
 *     y = pos.z, z = fx + fy*uKernelSize.x are assembled into a mat4 and
 *     multiplied with the input texel.
 *   - The RELU / RELU6 compile-time macros optionally clamp the results
 *     before the four imageStore calls.
 *
 * NOTE(review): inside the fz loop `int kernelX = 4*fz` shadows the outer
 * kernelX (= uKernelSize.x) that bounds the fx loop. This is legal GLSL
 * scoping but easy to misread.
 * NOTE(review): the stores always target pos.x+0..3, so uUnroll is
 * presumably expected to be 4 -- confirm at the call site. FORMAT, PRECISION
 * and the XLOCAL, YLOCAL, ZLOCAL sizes are not defined in this string and
 * must be supplied externally.
 */
const char* glsl_convolution_glsl = 
"layout(std430) buffer;\n"
"layout(FORMAT, binding=0) writeonly uniform PRECISION image3D uOutput;\n"
"layout(location=1) uniform mediump sampler3D uInput;\n"
"layout(location=2) uniform mediump sampler3D uKernel;\n"
"layout(binding=3) readonly buffer bias{\n"
"    vec4 data[];\n"
"} uBias;\n"
"layout(location=4) uniform ivec2 uPad;\n"
"layout(location=5) uniform ivec2 uKernelSize;\n"
"layout(location=6) uniform ivec2 uStride;\n"
"layout(location=7) uniform ivec2 uDilate;\n"
"layout(location=8) uniform int uUnroll;\n"
"layout(location=10) uniform ivec3 uOutputSize;\n"
"layout(location=11) uniform ivec3 uInputSize;\n"
"#define UP_DIV(x, y) (((x)+(y)-1)/(y))\n"
"//weight : oc ic h w -> oc/4, ic/4, ky kx ic4 oc4\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"void main()\n"
"{\n"
"    if (all(lessThan(ivec3(gl_GlobalInvocationID), uOutputSize)))\n"
"    {\n"
"        ivec3 pos = ivec3(gl_GlobalInvocationID)*ivec3(uUnroll, 1, 1);\n"
"        int kernelX = uKernelSize.x;\n"
"        ivec3 inputSize = uInputSize;\n"
"        ivec2 s0 = pos.xy*uStride-uPad;\n"
"        int fx, fy, fz;\n"
"        ivec2 sfxy = max(ivec2(0), (UP_DIV(-s0, uDilate)));\n"
"        ivec2 efxy = min(uKernelSize, UP_DIV(inputSize.xy-s0, uDilate));\n"
"        vec4 color = uBias.data[pos.z];\n"
"        vec4 color2 = color;\n"
"        vec4 color3 = color;\n"
"        vec4 color4 = color;\n"
"        int kernelY = pos.z;\n"
"        for (fy=sfxy.y; fy<efxy.y; ++fy)\n"
"        {\n"
"            int sy = fy*uDilate.y + s0.y;\n"
"            for (fx=0; fx<kernelX; ++fx)\n"
"            {\n"
"                int kernelZ = fx + fy*kernelX;\n"
"                int sx1 = fx*uDilate.x + s0.x;\n"
"                int sx2 = sx1 + uStride.x;\n"
"                int sx3 = sx1 + uStride.x * 2;\n"
"                int sx4 = sx1 + uStride.x * 3;\n"
"                float m1 = sx1 >= 0&& sx1 < inputSize.x ? 1.0 : 0.0;\n"
"                float m2 = sx2 >= 0&& sx2 < inputSize.x ? 1.0 : 0.0;\n"
"                float m3 = sx3 >= 0&& sx3 < inputSize.x ? 1.0 : 0.0;\n"
"                float m4 = sx4 >= 0&& sx4 < inputSize.x ? 1.0 : 0.0;\n"
"                fz = 0;\n"
"                for (; fz<inputSize.z; ++fz)\n"
"                {\n"
"                    int kernelX = 4*fz;\n"
"                    vec4 k0 = texelFetch(uKernel, ivec3(kernelX+0, kernelY, kernelZ), 0);\n"
"                    vec4 k1 = texelFetch(uKernel, ivec3(kernelX+1, kernelY, kernelZ), 0);\n"
"                    vec4 k2 = texelFetch(uKernel, ivec3(kernelX+2, kernelY, kernelZ), 0);\n"
"                    vec4 k3 = texelFetch(uKernel, ivec3(kernelX+3, kernelY, kernelZ), 0);\n"
"                    \n"
"                    mat4 k = mat4(k0, k1, k2, k3);\n"
"                    \n"
"                    color  += k*texelFetch(uInput, ivec3(sx1, sy, fz), 0) * m1;\n"
"                    color2 += k*texelFetch(uInput, ivec3(sx2, sy, fz), 0) * m2;\n"
"                    color3 += k*texelFetch(uInput, ivec3(sx3, sy, fz), 0) * m3;\n"
"                    color4 += k*texelFetch(uInput, ivec3(sx4, sy, fz), 0) * m4;\n"
"                }\n"
"            }\n"
"        }\n"
"        #ifdef RELU\n"
"        color = max(color, vec4(0));\n"
"        color2 = max(color2, vec4(0));\n"
"        color3 = max(color3, vec4(0));\n"
"        color4 = max(color4, vec4(0));\n"
"        #endif\n"
"        #ifdef RELU6\n"
"        color = clamp(color, vec4(0), vec4(6));\n"
"        color2 = clamp(color2, vec4(0), vec4(6));\n"
"        color3 = clamp(color3, vec4(0), vec4(6));\n"
"        color4 = clamp(color4, vec4(0), vec4(6));\n"
"        #endif\n"
"        imageStore(uOutput, ivec3(pos.x+0, pos.y, pos.z), color);\n"
"        imageStore(uOutput, ivec3(pos.x+1, pos.y, pos.z), color2);\n"
"        imageStore(uOutput, ivec3(pos.x+2, pos.y, pos.z), color3);\n"
"        imageStore(uOutput, ivec3(pos.x+3, pos.y, pos.z), color4);\n"
"    }\n"
"    \n"
"}\n"
;
/*
 * GLSL compute shader source: repacks a vec4 weight buffer into a 3D image.
 *
 * Each invocation (work-group size 1x1x1) handles four neighbouring texels
 * along x: pos.x is the invocation's x multiplied by 4. It reads four
 * consecutive vec4 entries of uKernel starting at
 *     kernelPos = pos.x*uFxFy + 4*pos.y*uIc_4*uFxFy + 4*pos.z
 * and stores them at (pos.x+0 .. pos.x+3, pos.y, pos.z) in uOutput.
 *
 * There is no bounds guard in the shader, so the dispatch size has to match
 * the image and buffer extents exactly.
 * NOTE(review): the layout comments embedded in the shader text describe the
 * intended weight ordering; the meaning of uFxFy and uIc_4 (presumably
 * kernel area and input-channel/4) is not established here -- confirm
 * against the host code. FORMAT and PRECISION must be supplied externally.
 */
const char* glsl_kernel2image_adreno_glsl = 
"layout(std430) buffer;\n"
"layout(FORMAT, binding=0) writeonly uniform PRECISION image3D uOutput;\n"
"layout(binding=2) readonly buffer kernel{\n"
"    vec4 data[];\n"
"} uKernel;\n"
"layout(location = 3) uniform int uFxFy;\n"
"layout(location = 4) uniform int uIc_4;\n"
"layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;\n"
"//weight buffer : oc ic h w -> oc/4, ic/4, ky kx ic4 oc4\n"
"//index : ky kx, oc/4, ic/4\n"
"//weight image : ky kx, oc/4, ic/4*ic4 oc4\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID) * ivec3(4, 1, 1);\n"
"    int kernelPos = 0\n"
"    + pos.x * uFxFy\n"
"    + 4*pos.y * uIc_4 * uFxFy\n"
"    + 4*pos.z\n"
"    ;\n"
"    vec4 color0 = uKernel.data[kernelPos+0];\n"
"    vec4 color1 = uKernel.data[kernelPos+1];\n"
"    vec4 color2 = uKernel.data[kernelPos+2];\n"
"    vec4 color3 = uKernel.data[kernelPos+3];\n"
"    \n"
"    imageStore(uOutput, ivec3(pos.x+0, pos.y, pos.z), color0);\n"
"    imageStore(uOutput, ivec3(pos.x+1, pos.y, pos.z), color1);\n"
"    imageStore(uOutput, ivec3(pos.x+2, pos.y, pos.z), color2);\n"
"    imageStore(uOutput, ivec3(pos.x+3, pos.y, pos.z), color3);\n"
"}\n"
;
/*
 * GLSL compute shader source: element-wise binary operation on two 3D
 * textures.
 *
 * For every position inside imgSize.xyz the shader fetches one texel from
 * uInput0 and uInput1, combines them, optionally applies max(sum, 0) when
 * activationType == 1, and stores the result in uOutput.
 *
 * The operator is chosen at compile time: `sum` is declared only inside the
 * ADD / MUL / SUB / REALDIV #ifdef branches, so exactly one of those macros
 * must be defined when the shader is compiled (none leaves `sum` undeclared,
 * more than one redeclares it).
 * FORMAT, PRECISION and XLOCAL, YLOCAL, ZLOCAL are likewise expected from
 * outside this string.
 */
const char* glsl_binary_glsl = 
"layout(FORMAT, binding=0) writeonly uniform PRECISION image3D uOutput;\n"
"layout(location=1) uniform mediump sampler3D uInput0;\n"
"layout(location=2) uniform mediump sampler3D uInput1;\n"
"layout(location=3) uniform ivec4 imgSize;\n"
"layout(location=4) uniform int activationType;\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    ivec3 inSize = imgSize.xyz;\n"
"    if(all(lessThan(pos, inSize)))\n"
"    {\n"
"#ifdef ADD\n"
"        vec4 sum = texelFetch(uInput0, pos, 0) + texelFetch(uInput1, pos, 0);\n"
"#endif\n"
"#ifdef MUL\n"
"        vec4 sum = texelFetch(uInput0, pos, 0) * texelFetch(uInput1, pos, 0);\n"
"#endif\n"
"#ifdef SUB\n"
"        vec4 sum = texelFetch(uInput0, pos, 0) - texelFetch(uInput1, pos, 0);\n"
"#endif\n"
"#ifdef REALDIV\n"
"        vec4 sum = texelFetch(uInput0, pos, 0) / texelFetch(uInput1, pos, 0);\n"
"#endif\n"
"        if(activationType == 1) {\n"
"            sum = max(sum, vec4(0));\n"
"        }\n"
"        imageStore(uOutput, pos, sum);\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: leaky ReLU on a 3D texture.
 *
 * For every position with x < imgSize.x and y < imgSize.y the shader fetches
 * the input texel and stores, per component, the value itself when it is
 * >= 0 and value * slope when it is negative (slope == 0 gives plain ReLU).
 *
 * The guard reads the imgSize uniform directly. An earlier revision declared
 * a local `ivec3 imgSize = imgSize.xyz;`, which shadowed the uniform inside
 * its own initializer; that was legal but fragile and hard to read.
 *
 * FORMAT, PRECISION and XLOCAL, YLOCAL, ZLOCAL are expected to be defined by
 * whoever compiles this string. pos.z is not bounds-checked here.
 */
const char* glsl_relu_glsl = 
"layout(FORMAT, binding=0) writeonly uniform PRECISION image3D uOutput;\n"
"layout(location=1) uniform mediump sampler3D uInput;\n"
"layout(location=2) uniform ivec4 imgSize;\n"
"layout(location=3) uniform float slope;\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    if(pos.x < imgSize.x && pos.y < imgSize.y)\n"
"    {\n"
"        vec4 dataIn =  texelFetch(uInput, pos, 0);\n"
"        bvec4 lessZero = bvec4(lessThan(dataIn, vec4(0.0)));\n"
"        vec4 dataTemp = dataIn * vec4(slope);\n"
"        imageStore(uOutput, pos, mix(dataIn, dataTemp, lessZero));\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: copies a vec4 buffer into a 3D image.
 *
 * Every invocation that passes the uWidth x uHeight guard reads one vec4 at
 *     index = x + y*uWidth + z*uWidth*uHeight
 * and stores it unchanged at (x, y, z) in uImage, so each buffer element
 * maps to exactly one texel.
 *
 * The buffer block is named destBuffer although it is the read-only source
 * of the copy. pos.z is not bounds-checked. FORMAT and PRECISION must be
 * defined by the code that compiles this string.
 */
const char* glsl_nc4hw4_buffer_to_image_glsl = 
"layout(FORMAT, binding=0) writeonly uniform PRECISION image3D uImage;\n"
"layout(binding=1) readonly buffer destBuffer{\n"
"    vec4 data[];\n"
"} uInBuffer;\n"
"layout(location = 2) uniform int uWidth;\n"
"layout(location = 3) uniform int uHeight;\n"
"layout (local_size_x = 8, local_size_y = 8, local_size_z = 1) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    if (pos.x < uWidth && pos.y < uHeight)\n"
"    {\n"
"        vec4 color = uInBuffer.data[uWidth*pos.y+pos.x+pos.z*uWidth*uHeight];\n"
"        imageStore(uImage, pos, color);\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: packs a channel-interleaved float buffer into
 * a 3D image, four channels per texel.
 *
 * For the pixel (x, y) and channel group pos.z the shader reads the floats at
 *     (y*uWidth + x)*uChannel + 4*pos.z + lane,   lane = 0..3
 * and stores them as the r/g/b/a components of texel (x, y, pos.z).
 *
 * Lanes whose channel index is >= uChannel are not read; they are stored as
 * 0.0. Without that guard a channel count that is not a multiple of four
 * would pull the next pixel's data (or memory past the tensor) into the
 * padding lanes.
 *
 * FORMAT, PRECISION and XLOCAL, YLOCAL, ZLOCAL must be defined by the code
 * that compiles this string. The buffer block is named destBuffer although
 * it is the read-only source.
 */
const char* glsl_nhwc_buffer_to_image_glsl = 
"layout(FORMAT, binding=0) writeonly uniform PRECISION image3D uImage;\n"
"layout(binding=1) readonly buffer destBuffer{\n"
"    float data[];\n"
"} uInBuffer;\n"
"layout(location = 2) uniform int uWidth;\n"
"layout(location = 3) uniform int uHeight;\n"
"layout(location = 4) uniform int uChannel;\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    if (pos.x < uWidth && pos.y < uHeight)\n"
"    {\n"
"        vec4 color = vec4(0.0);\n"
"        int z = pos.z*4;\n"
"        int pixelBase = pos.y*uWidth*uChannel + pos.x*uChannel + z;\n"
"        // only touch channels that really exist; padding lanes stay 0\n"
"        if (z+0 < uChannel) { color.r = uInBuffer.data[pixelBase+0]; }\n"
"        if (z+1 < uChannel) { color.g = uInBuffer.data[pixelBase+1]; }\n"
"        if (z+2 < uChannel) { color.b = uInBuffer.data[pixelBase+2]; }\n"
"        if (z+3 < uChannel) { color.a = uInBuffer.data[pixelBase+3]; }\n"
"        imageStore(uImage, pos, color);\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: im2col expansion of a 3D-texture input into a
 * 2D image.
 *
 * index.xy addresses one output pixel; index.z combines batch and input
 * channel group (ic_4 = index.z % inputSize.z, ib = index.z / inputSize.z).
 * The flattened output pixel number
 *     destYOrigin = ib*outputSize.x*outputSize.y + index.y*outputSize.x + index.x
 * selects row destYOrigin/4 and column offset destYOrigin%4 of uOutput. For
 * every kernel tap (fx, fy) the source texel at
 *     (fx*dilate.x + s0.x, fy*dilate.y + s0.y, index.z),  s0 = index.xy*stride - pad
 * is written to column 4*destX + destXOffset, where
 *     destX = fx + fy*kernelSize.x + ic_4*kernelSize.x*kernelSize.y.
 *
 * Taps that fall outside [0, inputSize.x) x [0, inputSize.y) are written as
 * zero explicitly. texelFetch with out-of-range coordinates has undefined
 * results in GLSL ES, so padding must not depend on what the driver returns.
 * Every tap is still stored, so the output image is fully overwritten.
 *
 * FORMAT and XLOCAL, YLOCAL, ZLOCAL must be defined by the code that compiles
 * this string.
 */
const char* glsl_im2col_glsl = 
"layout(std430) buffer;\n"
"layout(binding=0, FORMAT) writeonly mediump uniform image2D uOutput;\n"
"layout(location=1) uniform mediump sampler3D uInput;\n"
"layout(location=2) uniform ivec2 pad;\n"
"layout(location=3) uniform ivec2 kernelSize;\n"
"layout(location=4) uniform ivec2 stride;\n"
"layout(location=5) uniform ivec2 dilate;\n"
"layout(location=6) uniform ivec4 inputSize;\n"
"layout(location=7) uniform ivec4 outputSize;\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"#define UP_DIV(x, y) (((x)+(y)-1)/(y))\n"
"//index : ib*ic/4, oh, ow\n"
"//input image ic/4, ih, iw * ic4\n"
"//inputsize : ic/4, ih, iw\n"
"//outputsize : oc/4, oh, ow\n"
"//output : temp image : (ib*oh*ow)/ 4, ic/4*ky*kx*(ib*oh*ow)%4*ic4\n"
"void main()\n"
"{\n"
"    ivec3 index = ivec3(gl_GlobalInvocationID);\n"
"    if (index.x < outputSize.x && index.y < outputSize.y)\n"
"    {\n"
"        ivec2 s0 = index.xy*stride-pad;\n"
"        int ic_4 = index.z % inputSize.z; //input channel\n"
"        int ib = index.z / inputSize.z; // input batch\n"
"        \n"
"        int destYOrigin = ib*outputSize.x*outputSize.y + index.y*outputSize.x + index.x;\n"
"        int destY = destYOrigin / 4;\n"
"        int destXOffset = destYOrigin % 4;\n"
"        for (int fy=0; fy<kernelSize.y; ++fy)\n"
"        {\n"
"            int sy = fy*dilate.y + s0.y;\n"
"            bool insideY = sy >= 0 && sy < inputSize.y;\n"
"            for (int fx=0; fx<kernelSize.x; ++fx)\n"
"            {\n"
"                int sx = fx*dilate.x + s0.x;\n"
"                int destX = fx + fy*kernelSize.x + ic_4*kernelSize.x * kernelSize.y;\n"
"                // padding taps are written as zero instead of fetching out of range\n"
"                vec4 color = vec4(0.0);\n"
"                if (insideY && sx >= 0 && sx < inputSize.x)\n"
"                {\n"
"                    color = texelFetch(uInput, ivec3(sx, sy, index.z), 0);\n"
"                }\n"
"                imageStore(uOutput, ivec2(4*destX+destXOffset, destY), color);\n"
"            }\n"
"        }\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: packs a planar float buffer into a 3D image,
 * four planes per texel.
 *
 * Every invocation that passes the uWidth x uHeight guard gathers the floats
 * at
 *     x + y*uWidth + (4*pos.z + lane)*uWidth*uHeight,   lane = 0..3
 * into the r/g/b/a components of one vec4 and stores it at (x, y, pos.z).
 *
 * NOTE(review): all four planes are always read and pos.z is not
 * bounds-checked, so the source buffer presumably has to hold a multiple of
 * four planes -- verify against the host-side allocation. The buffer block
 * is named destBuffer although it is the read-only source. FORMAT and
 * PRECISION must be defined by the code that compiles this string.
 */
const char* glsl_nchw_buffer_to_image_glsl = 
"layout(FORMAT, binding=0) writeonly uniform PRECISION image3D uImage;\n"
"layout(binding=1) readonly buffer destBuffer{\n"
"    float data[];\n"
"} uInBuffer;\n"
"layout(location = 2) uniform int uWidth;\n"
"layout(location = 3) uniform int uHeight;\n"
"layout (local_size_x = 8, local_size_y = 8, local_size_z = 1) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    if (pos.x < uWidth && pos.y < uHeight)\n"
"    {\n"
"        vec4 color;\n"
"        int z = pos.z*4;\n"
"        color.r = uInBuffer.data[uWidth*pos.y+pos.x+(z+0)*uWidth*uHeight];\n"
"        color.g = uInBuffer.data[uWidth*pos.y+pos.x+(z+1)*uWidth*uHeight];\n"
"        color.b = uInBuffer.data[uWidth*pos.y+pos.x+(z+2)*uWidth*uHeight];\n"
"        color.a = uInBuffer.data[uWidth*pos.y+pos.x+(z+3)*uWidth*uHeight];\n"
"        imageStore(uImage, pos, color);\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: unpacks a 3D image into a channel-interleaved
 * float buffer.
 *
 * For the pixel (x, y) and channel group pos.z the shader loads one texel
 * and writes its r/g/b/a components to
 *     (y*uWidth + x)*uChannel + 4*pos.z + lane,   lane = 0..3
 *
 * Lanes whose channel index is >= uChannel are skipped. Without that guard a
 * channel count that is not a multiple of four makes the last channel group
 * write into the slots of the following pixel (racing with the invocation
 * that owns them) or past the end of the tensor.
 *
 * FORMAT, PRECISION and XLOCAL, YLOCAL, ZLOCAL must be defined by the code
 * that compiles this string.
 */
const char* glsl_image_to_nhwc_buffer_glsl = 
"layout(FORMAT, binding=0) readonly uniform PRECISION image3D uImage;\n"
"layout(binding=1) writeonly buffer destBuffer{\n"
"    float data[];\n"
"} uOutBuffer;\n"
"layout(location = 2) uniform int uWidth;\n"
"layout(location = 3) uniform int uHeight;\n"
"layout(location = 4) uniform int uChannel;\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    if (pos.x < uWidth && pos.y < uHeight)\n"
"    {\n"
"        vec4 color = imageLoad(uImage, pos);\n"
"        int z = pos.z*4;\n"
"        int pixelBase = pos.y*uWidth*uChannel+pos.x*uChannel+z;\n"
"        // only write channels that really exist in the destination\n"
"        if (z+0 < uChannel) { uOutBuffer.data[pixelBase+0] = color.r; }\n"
"        if (z+1 < uChannel) { uOutBuffer.data[pixelBase+1] = color.g; }\n"
"        if (z+2 < uChannel) { uOutBuffer.data[pixelBase+2] = color.b; }\n"
"        if (z+3 < uChannel) { uOutBuffer.data[pixelBase+3] = color.a; }\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: copies a 3D image into a vec4 buffer.
 *
 * Every invocation that passes the uWidth x uHeight guard loads the texel at
 * (x, y, z) and writes it unchanged to
 *     index = x + y*uWidth + z*uWidth*uHeight
 * in the std430 buffer, so each texel maps to exactly one vec4 element.
 *
 * pos.z is not bounds-checked. FORMAT and PRECISION must be defined by the
 * code that compiles this string.
 */
const char* glsl_image_to_nc4hw4_buffer_glsl = 
"layout(FORMAT, binding=0) readonly uniform PRECISION image3D uImage;\n"
"layout(std430, binding=1) writeonly buffer destBuffer{\n"
"    vec4 data[];\n"
"} uOutBuffer;\n"
"layout(location = 2) uniform int uWidth;\n"
"layout(location = 3) uniform int uHeight;\n"
"layout (local_size_x = 8, local_size_y = 8, local_size_z = 1) in;\n"
"void main()\n"
"{\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    if (pos.x < uWidth && pos.y < uHeight)\n"
"    {\n"
"        vec4 color = imageLoad(uImage, pos);\n"
"        uOutBuffer.data[uWidth*pos.y+pos.x+pos.z*uWidth*uHeight] = color;\n"
"    }\n"
"}\n"
;
/*
 * GLSL compute shader source: softmax along the height (y) axis of a 3D
 * texture.
 *
 * One invocation handles the whole column (pos.x, *, pos.z), provided
 * pos.x < w and pos.z < (c + 3) / 4. It makes three passes over the h rows:
 *   1. component-wise maximum of the column,
 *   2. sum of exp(value - max),
 *   3. store exp(value - max) / sum for every row.
 *
 * The maximum is seeded from row 0 rather than from a fixed sentinel. With a
 * constant seed such as -1000.0, a column whose values are all below the
 * seed makes every exp() underflow, the sum becomes 0 and the result is
 * 0/0. Seeding from real data keeps at least one exp(0) == 1 term in the
 * sum. The `h > 0` guard keeps the seed fetch in range; with h <= 0 no
 * texel is stored in any case.
 *
 * FORMAT, PRECISION and XLOCAL, YLOCAL, ZLOCAL must be defined by the code
 * that compiles this string.
 */
const char* glsl_softmaxHeight_glsl = 
"layout(FORMAT, binding=0) writeonly uniform PRECISION image3D uOutput;\n"
"layout(location=1) uniform mediump sampler3D uInput;\n"
"layout(location=2) uniform int w;\n"
"layout(location=3) uniform int h;\n"
"layout(location=4) uniform int c;\n"
"layout (local_size_x = XLOCAL, local_size_y = YLOCAL, local_size_z = ZLOCAL) in;\n"
"void main()\n"
"{\n"
"    // input tensor's layout is NC4HW4\n"
"    ivec3 pos = ivec3(gl_GlobalInvocationID);\n"
"    \n"
"    int channelDiv4 = (c + 3) / 4;\n"
"    \n"
"    if(pos.x < w && pos.z < channelDiv4 && h > 0)\n"
"    {\n"
"        // get the max value, seeded from the first row\n"
"        vec4 maxValue = texelFetch(uInput, ivec3(pos.x, 0, pos.z), 0);\n"
"        for(int i = 1; i < h; ++i)\n"
"        {\n"
"            maxValue = max(maxValue, texelFetch(uInput, ivec3(pos.x, i, pos.z), 0));\n"
"        }\n"
"        \n"
"        // sum\n"
"        vec4 sum = vec4(0.0);\n"
"        for(int i = 0; i < h; ++i)\n"
"        {\n"
"            sum += exp(texelFetch(uInput, ivec3(pos.x, i, pos.z), 0) - maxValue);\n"
"        }\n"
"        // div\n"
"        for(int i = 0; i < h; ++i)\n"
"        {\n"
"            ivec3 curPos = ivec3(pos.x, i, pos.z);\n"
"            imageStore(uOutput, curPos, exp(texelFetch(uInput, curPos, 0) - maxValue) / sum);\n"
"        }\n"
"        \n"
"    }\n"
"}\n"
;
